Pokemon data analysis with PCA

We would like to know which Pokemon have similar species strengths to one another; principal component analysis (PCA) can help us answer this question.

The following files are available on https://github.com/v0369012/Pokemon_PCA.

# Load the raw Pokemon value table (generations 1 to 7) as a character
# vector holding one element per line of the file.
PKM_values_7 <- readLines(
  con = "Pokemon_list_g7_TC.txt",
  encoding = "UTF-8"
)

To simplify the analysis, we removed Pokemon with special forms, such as Mega and Alolan forms.

# Loading packages
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 3.2.1     √ purrr   0.3.3
## √ tibble  2.1.3     √ dplyr   0.8.3
## √ tidyr   1.0.0     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.4.0
## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Keep only Pokemon without a special form: their lines contain exactly
# 8 "|" separators, i.e. 9 fields once split.
PKM_without_spf <- str_count(PKM_values_7, "\\|") == 8
PKM_values_7_without_spf <- PKM_values_7[PKM_without_spf]
PKM_values_7_without_spf_split <- str_split(PKM_values_7_without_spf, "\\|")

# Arrange the fields as one row per Pokemon and one column per field, so the
# table size follows the data instead of a hard-coded row count.
PKM_fields <- matrix(
  as.character(unlist(PKM_values_7_without_spf_split)),
  ncol = 9,
  byrow = TRUE
)

# Number of retained Pokemon per generation (1 to 7), counted by row position.
PKM_generation_sizes <- c(151, 100, 134, 105, 150, 67, 85)
if (nrow(PKM_fields) != sum(PKM_generation_sizes)) {
  # Fail loudly rather than mislabel generations or pad the table with NA rows.
  stop(
    "Expected ", sum(PKM_generation_sizes),
    " Pokemon without special form but found ", nrow(PKM_fields),
    "; update PKM_generation_sizes to match the input file.",
    call. = FALSE
  )
}

# Make a Pokemon table (field 1 is unused; the last field carries a trailing
# "}" that is stripped before conversion).
PKM_values_7_without_spf_df <- data.frame(
  Number = PKM_fields[, 2],
  Name = PKM_fields[, 3],
  generation = rep(
    as.numeric(seq_along(PKM_generation_sizes)),
    times = PKM_generation_sizes
  ),
  HP = as.numeric(PKM_fields[, 4]),
  ATK = as.numeric(PKM_fields[, 5]),
  DEF = as.numeric(PKM_fields[, 6]),
  SATK = as.numeric(PKM_fields[, 7]),
  SDEF = as.numeric(PKM_fields[, 8]),
  SPEED = as.numeric(str_replace_all(PKM_fields[, 9], "\\}", ""))
)

# Check the table
head(PKM_values_7_without_spf_df)
##   Number     Name generation HP ATK DEF SATK SDEF SPEED
## 1    001 妙蛙種子          1 45  49  49   65   65    45
## 2    002   妙蛙草          1 60  62  63   80   80    60
## 3    003   妙蛙花          1 80  82  83  100  100    80
## 4    004   小火龍          1 39  52  43   60   50    65
## 5    005   火恐龍          1 58  64  58   80   65    80
## 6    006   噴火龍          1 78  84  78  109   85   100
# Read the Pokemon types table
PKM_types_7 <- readLines("Pokemon_types.txt")

# Split every line into its "|"-separated fields once, instead of re-splitting
# the whole file for each row. All lines are processed, so every line must
# contain at least 6 fields.
PKM_types_7_fields <- str_split(PKM_types_7, "\\|")

# Extract the Pokemon number (3rd field)
PKM_types_7_number <- vapply(
  PKM_types_7_fields,
  function(fields) fields[[3]],
  character(1)
)

# Extract the Pokemon names (4th field)
PKM_types_7_name <- vapply(
  PKM_types_7_fields,
  function(fields) fields[[4]],
  character(1)
)

# We chose Pokemon first types for the analysis (6th field, with any "}"
# characters removed)
PKM_types_7_types1 <- str_remove_all(
  vapply(PKM_types_7_fields, function(fields) fields[[6]], character(1)),
  "\\}"
)

# Make Pokemon types 1 table
PKM_types_7_df <- data.frame(
  Number = PKM_types_7_number,
  Name = PKM_types_7_name,
  types1 = PKM_types_7_types1
)

# Check the table
head(PKM_types_7_df)
##   Number       Name types1
## 1    001  Bulbasaur  Grass
## 2    002    Ivysaur  Grass
## 3    003   Venusaur  Grass
## 4    004 Charmander   Fire
## 5    005 Charmeleon   Fire
## 6    006  Charizard   Fire
# Flag the Pokemon numbers that consist of digits only
position_without_letters <- PKM_types_7_number %>%
  str_detect("^[0-9]*$")
PKM_types_7_number_without_letters <-
  PKM_types_7_number[position_without_letters]

# Drop the rows of the types table whose number contains letters
PKM_types_7_df_t <- PKM_types_7_df %>%
  filter(Number %in% PKM_types_7_number_without_letters)

# Join the types table with the stats table on the Pokemon number
PKM_merged_df <- merge(
  x = PKM_types_7_df_t,
  y = PKM_values_7_without_spf_df,
  by = "Number"
)


# Check the correlation between the variables
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Preview the long-format correlation matrix of columns 5 onwards
PKM_merged_df[, 5:ncol(PKM_merged_df)] %>%
  cor() %>%
  melt() %>%
  head()
##         Var1       Var2      value
## 1 generation generation 1.00000000
## 2         HP generation 0.06737277
## 3        ATK generation 0.12726340
## 4        DEF generation 0.09396861
## 5       SATK generation 0.09037036
## 6       SDEF generation 0.06661734
# Draw the correlation matrix as a heatmap (one tile per variable pair)
PKM_cor_long <- melt(cor(PKM_merged_df[, 5:ncol(PKM_merged_df)]))
ggplot(PKM_cor_long, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient2(
    low = "firebrick4",
    high = "steelblue",
    mid = "white",
    midpoint = 0
  ) +
  guides(fill = guide_legend(title = "Correlation")) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    axis.title = element_blank()
  )

# PCA on columns 5 onwards of the merged table.
# `retx = TRUE` keeps the rotated scores in `pca.model$x`; it is spelled out
# because the second positional argument of prcomp() is `retx`, not a scaling
# switch. With the remaining defaults the variables are centred but not
# scaled to unit variance.
# NOTE(review): the selected columns include `generation` next to the six
# base stats -- confirm that this is intended.
pca.model <- prcomp(PKM_merged_df[, 5:ncol(PKM_merged_df)], retx = TRUE)

# Show pca summary
summary(pca.model)
## Importance of components:
##                            PC1     PC2     PC3     PC4      PC5      PC6
## Standard deviation     45.3407 30.5622 26.2864 22.8902 18.60125 14.52452
## Proportion of Variance  0.4314  0.1960  0.1450  0.1100  0.07261  0.04427
## Cumulative Proportion   0.4314  0.6274  0.7724  0.8823  0.95496  0.99922
##                            PC7
## Standard deviation     1.92186
## Proportion of Variance 0.00078
## Cumulative Proportion  1.00000
# Print the rotation matrix (variable loadings) of the fitted PCA
pca.model[["rotation"]]
##                    PC1          PC2          PC3          PC4          PC5
## generation 0.005173058  0.003138396 -0.002254479  0.001608106  0.007858027
## HP         0.365911209 -0.014089097 -0.126630236  0.795808412 -0.333377213
## ATK        0.464159122  0.093172879 -0.714863910 -0.058629494  0.166243167
## DEF        0.419466963  0.647724808  0.047193615 -0.369365737  0.034422515
## SATK       0.463676640 -0.390824439  0.339767984  0.089206914  0.706333101
## SDEF       0.420806046  0.127751437  0.577626506 -0.039023568 -0.399765049
## SPEED      0.287423468 -0.634427982 -0.146994523 -0.466194387 -0.448617477
##                     PC6           PC7
## generation  0.005800753  0.9999301595
## HP         -0.324688598  0.0010893090
## ATK         0.483426571 -0.0083220613
## DEF        -0.514447494 -0.0007887328
## SATK       -0.099614896 -0.0055224474
## SDEF        0.558222907 -0.0013096278
## SPEED      -0.273221981  0.0060330843
# Make a pca table to plot: scores on the first two principal components,
# with the Pokemon number as row name.
p1_p2_table <- as.data.frame(pca.model$x[, 1:2])
rownames(p1_p2_table) <- PKM_merged_df[, 1]

# Pokemon numbers to highlight through the `legend` flag
legend_number <- c(
  144:146, 150:151,
  243:245, 249:251,
  377:386,
  479:494,
  638:649,
  716:721,
  785:809
)

# TRUE for every row whose number is in `legend_number`; `legend_position`
# keeps the matching row indices.
legend <- PKM_merged_df[, "Number"] %in% legend_number
legend_position <- which(legend)

# Attach the descriptive columns used to label and colour the plots
p1_p2_table_t <- cbind(
  p1_p2_table,
  Number = PKM_merged_df[, "Number"],
  Name = PKM_merged_df[, "Name.y"],
  generation = PKM_merged_df[, "generation"],
  types1 = PKM_merged_df[, "types1"],
  legend = legend
)

# Store generation as text so it is plotted as a discrete variable
p1_p2_table_t[, "generation"] <- as.character(p1_p2_table_t[, "generation"])

# Scatter plots of PC1 against PC2, one point per Pokemon

# Plain version without colouring
pca_gg <- ggplot(
  data = p1_p2_table_t,
  mapping = aes(x = PC1, y = PC2, label = Name)
) +
  geom_point(size = 2.5)

# Coloured by generation
pca_gg_generation <- ggplot(
  data = p1_p2_table_t,
  mapping = aes(x = PC1, y = PC2, label = Name, color = generation)
) +
  geom_point(size = 2.5)

# Coloured by first type
pca_gg_types1 <- ggplot(
  data = p1_p2_table_t,
  mapping = aes(x = PC1, y = PC2, label = Name, color = types1)
) +
  geom_point(size = 2.5)

# Coloured by the legend flag
pca_gg_legend <- ggplot(
  data = p1_p2_table_t,
  mapping = aes(x = PC1, y = PC2, label = Name, color = legend)
) +
  geom_point(size = 2.5)

# User-interactive visualization
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Convert each ggplot object into an interactive plotly widget and display it

# Plain scatter plot
pca_ggly <- ggplotly(p = pca_gg)
pca_ggly

# Coloured by generation
pca_ggly_generation <- ggplotly(p = pca_gg_generation)
pca_ggly_generation

# Coloured by first type
pca_ggly_types1 <- ggplotly(p = pca_gg_types1)
pca_ggly_types1

# Coloured by the legend flag
pca_ggly_legend <- ggplotly(p = pca_gg_legend)
pca_ggly_legend